C++ web spider(网络爬虫)的 Winsock 实现

来源:互联网 发布:音悦台mac官方下载 编辑:程序博客网 时间:2024/05/13 07:10

写了一个网络爬虫,可以抓取网上的图片。

需要给定初始网站即可。

在vs2010中编译通过。

需要使用多字节字符集进行编译,

vs2010默认的是Unicode字符集。

编译后,运行即可,有惊喜哦!!!


[cpp] view plaincopyprint?在CODE上查看代码片派生到我的代码片
  //#include <Windows.h>
  #include <time.h>

  #include <cstdlib>
  #include <cstring>
  #include <fstream>
  #include <hash_set>
  #include <iostream>
  #include <queue>
  #include <string>
  #include <unordered_set>
  #include <vector>

  #include "winsock2.h"

  #pragma comment(lib, "ws2_32.lib")
  using namespace std;
  14.  
  // Initial size, in bytes, of the buffer used to receive one HTTP response (1 MiB).
  #define DEFAULT_PAGE_BUF_SIZE 1048576

  // Crawl frontier: links that were discovered but not fetched yet (FIFO order).
  std::queue<std::string> hrefUrl;
  // Every link that has ever been queued, so that no link is queued twice.
  std::unordered_set<std::string> visitedUrl;
  // Every image URL that has already been scheduled for download.
  std::unordered_set<std::string> visitedImg;
  // NOTE(review): not referenced by any function in this listing.
  int depth=0;
  // Running number printed in front of each saved image; starts at 1.
  int g_ImgCnt=1;
  22.  
  // Splits a URL into its host name and its resource path.
  //
  // An optional leading "http://" is skipped. Everything up to the first '/'
  // becomes `host`; the remainder, starting with that '/' and ending at the
  // first whitespace character (or the end of the string), becomes `resource`.
  //
  // Returns false and leaves both outputs untouched when there is no '/'
  // after the scheme or when the host part is empty (for example a relative
  // link such as "/a/b.png").
  bool ParseURL( const std::string & url, std::string & host, std::string & resource ){
      static const char kScheme[] = "http://";
      const std::string::size_type schemeLen = sizeof(kScheme) - 1;

      std::string::size_type hostBegin = 0;
      if( url.compare( 0, schemeLen, kScheme ) == 0 )
          hostBegin = schemeLen;

      const std::string::size_type slash = url.find( '/', hostBegin );
      if( slash == std::string::npos || slash == hostBegin )
          return false;

      // A request target must not contain whitespace, so cut the resource there.
      std::string::size_type resourceEnd = url.find_first_of( " \t\r\n\v\f", slash );
      if( resourceEnd == std::string::npos )
          resourceEnd = url.size();

      // std::string has no fixed capacity, so arbitrarily long URLs are safe.
      const std::string parsedHost = url.substr( hostBegin, slash - hostBegin );
      const std::string parsedResource = url.substr( slash, resourceEnd - slash );
      host = parsedHost;
      resource = parsedResource;
      return true;
  }
  37. //使用Get请求,得到响应 
  38. bool GetHttpResponse( const string & url, char * &response,int &bytesRead ){ 
  39.     string host, resource; 
  40.     if(!ParseURL( url, host, resource )){ 
  41.         cout << "Can not parse the url"<<endl; 
  42.         return false
  43.     } 
  44.      
  45.     //建立socket 
  46.     struct hostent * hp= gethostbyname( host.c_str() ); 
  47.     if( hp==NULL ){ 
  48.         cout<< "Can not find host address"<<endl; 
  49.         return false
  50.     } 
  51.  
  52.     SOCKET sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP); 
  53.     if( sock == -1 || sock == -2 ){ 
  54.         cout << "Can not create sock."<<endl; 
  55.         return false
  56.     } 
  57.  
  58.     //建立服务器地址 
  59.     SOCKADDR_IN sa; 
  60.     sa.sin_family = AF_INET; 
  61.     sa.sin_port = htons( 80 ); 
  62.     //char addr[5]; 
  63.     //memcpy( addr, hp->h_addr, 4 ); 
  64.     //sa.sin_addr.s_addr = inet_addr(hp->h_addr); 
  65.     memcpy( &sa.sin_addr, hp->h_addr, 4 ); 
  66.  
  67.     //建立连接 
  68.     if( 0!= connect( sock, (SOCKADDR*)&sa,sizeof(sa) ) ){ 
  69.         cout << "Can not connect: "<< url <<endl; 
  70.         closesocket(sock); 
  71.         return false
  72.     }; 
  73.  
  74.     //准备发送数据 
  75.     string request = "GET " + resource +" HTTP/1.1\r\nHost:" + host + "\r\nConnection:Close\r\n\r\n"
  76.  
  77.     //发送数据 
  78.     if( SOCKET_ERROR ==send( sock, request.c_str(), request.size(), 0 ) ){ 
  79.         cout << "send error" <<endl; 
  80.         closesocket( sock ); 
  81.         return false
  82.     } 
  83.  
  84.     //接收数据 
  85.     int m_nContentLength = DEFAULT_PAGE_BUF_SIZE; 
  86.     char *pageBuf = (char *)malloc(m_nContentLength); 
  87.     memset(pageBuf, 0, m_nContentLength); 
  88.  
  89.     bytesRead = 0; 
  90.     int ret = 1; 
  91.     cout <<"Read: "
  92.     while(ret > 0){ 
  93.         ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0); 
  94.          
  95.         if(ret > 0) 
  96.         { 
  97.             bytesRead += ret; 
  98.         } 
  99.  
  100.         if( m_nContentLength - bytesRead<100){ 
  101.             cout << "\nRealloc memorry"<<endl; 
  102.             m_nContentLength *=2; 
  103.             pageBuf = (char*)realloc( pageBuf, m_nContentLength);      //重新分配内存 
  104.         } 
  105.         cout << ret <<" "
  106.     } 
  107.     cout <<endl; 
  108.  
  109.     pageBuf[bytesRead] = '\0'
  110.     response = pageBuf; 
  111.     closesocket( sock ); 
  112.     return true
  113.     //cout<< response <<endl; 
  114. //提取所有的URL以及图片URL 
  115. void HTMLParse ( string & htmlResponse, vector<string> & imgurls,const string & host ){ 
  116.     //找所有连接,加入queue中 
  117.     const char *p= htmlResponse.c_str(); 
  118.     char *tag="href=\""
  119.     const char *pos = strstr( p, tag ); 
  120.     ofstream ofile("url.txt", ios::app); 
  121.     while( pos ){ 
  122.         pos +=strlen(tag); 
  123.         const char * nextQ = strstr( pos,"\"" ); 
  124.         if( nextQ ){ 
  125.             char * url =new char[ nextQ-pos+1 ]; 
  126.             //char url[100]; //固定大小的会发生缓冲区溢出的危险 
  127.             sscanf( pos, "%[^\"]", url); 
  128.             string surl = url; 
  129.             if( visitedUrl.find( surl ) == visitedUrl.end() ){ 
  130.                 visitedUrl.insert( surl ); 
  131.                 ofile << surl<<endl; 
  132.                 hrefUrl.push( surl ); 
  133.             } 
  134.             pos = strstr(pos, tag ); 
  135.             delete [] url; 
  136.         } 
  137.     } 
  138.     ofile << endl << endl; 
  139.     ofile.close(); 
  140.  
  141.     tag ="<img "
  142.     const char* att1="src=\""
  143.     const char* att2="lazy-src=\""
  144.     const char *pos0 = strstr( p, tag ); 
  145.     while( pos0 ){ 
  146.         pos0 += strlen( tag ); 
  147.         const char* pos2 = strstr( pos0, att2 ); 
  148.         if( !pos2 || pos2 > strstr( pos0,">") ) 
  149.             pos = strstr( pos0, att1)+strlen(att1); 
  150.         else 
  151.             pos = pos2 + strlen(att2); 
  152.         const char * nextQ = strstr( pos,"\""); 
  153.         if( nextQ ){ 
  154.             char * url = newchar[nextQ-pos+1]; 
  155.             sscanf( pos, "%[^\"]", url); 
  156.             cout << url<<endl; 
  157.             string imgUrl = url; 
  158.             if( visitedImg.find( imgUrl ) == visitedImg.end() ){ 
  159.                 visitedImg.insert( imgUrl ); 
  160.                 imgurls.push_back( imgUrl ); 
  161.             } 
  162.             pos0 = strstr(pos0, tag ); 
  163.             delete [] url; 
  164.         } 
  165.     } 
  166.     cout << "end of Parse this html"<<endl; 
  167.  
  // Converts a URL into a file name: every character that is not allowed in a
  // Windows file name (\ / : * ? " < > |) is dropped and ".txt" is appended.
  std::string ToFileName( const std::string &url ){
      const std::string forbidden = "\\/:*?\"<>|";
      std::string fileName;
      fileName.reserve( url.size() + 4 );
      for( std::string::size_type i = 0; i < url.size(); ++i ){
          const char ch = url[i];
          if( forbidden.find( ch ) == std::string::npos )
              fileName += ch;
      }
      fileName += ".txt";
      return fileName;
  }
  180. //下载图片到img文件夹 
  181. void DownLoadImg( vector<string> & imgurls,const string &url ){ 
  182.  
  183.     //生成保存该url下图片的文件夹 
  184.     string foldname = ToFileName( url ); 
  185.     foldname = "./img/"+foldname; 
  186.     if(!CreateDirectory( foldname.c_str(),NULL )) 
  187.         cout << "Can not create directory:"<< foldname<<endl; 
  188.     char *image; 
  189.     int byteRead; 
  190.     for( int i=0; i<imgurls.size(); i++){ 
  191.         //判断是否为图片,bmp,jgp,jpeg,gif  
  192.         string str = imgurls[i]; 
  193.         int pos = str.find_last_of("."); 
  194.         if( pos == string::npos ) 
  195.             continue
  196.         else
  197.             string ext = str.substr( pos+1, str.size()-pos-1 ); 
  198.             if( ext!="bmp"&& ext!="jpg" && ext!="jpeg"&& ext!="gif"&&ext!="png"
  199.                 continue
  200.         } 
  201.         //下载其中的内容 
  202.         if( GetHttpResponse(imgurls[i], image, byteRead)){ 
  203.             const char *p=image; 
  204.             const char * pos = strstr(p,"\r\n\r\n")+strlen("\r\n\r\n"); 
  205.             int index = imgurls[i].find_last_of("/"); 
  206.             if( index!=string::npos ){ 
  207. string imgname = imgurls[i].substr( index , imgurls[i].size() );
  208.                 ofstream ofile( foldname+imgname, ios::binary ); 
  209.                 if( !ofile.is_open() ) 
  210.                     continue
  211.                 cout <<g_ImgCnt++<< foldname+imgname<<endl; 
  212.                 ofile.write( pos, byteRead- (pos-p) ); 
  213.                 ofile.close(); 
  214.             } 
  215.             free(image); 
  216.         } 
  217.     } 
  218.  
  219.  
  220.  
  221. //广度遍历 
  222. void BFS( const string & url ){ 
  223.     char * response; 
  224.     int bytes; 
  225.     if( !GetHttpResponse( url, response, bytes ) ){ 
  226.         cout << "The url is wrong! ignore." << endl; 
  227.         return
  228.     } 
  229.     string httpResponse=response; 
  230.     free( response ); 
  231.     string filename = ToFileName( url ); 
  232.     ofstream ofile( "./html/"+filename ); 
  233.     if( ofile.is_open() ){ 
  234.         ofile << httpResponse << endl; 
  235.         ofile.close(); 
  236.     } 
  237.     vector<string> imgurls; 
  238.     HTMLParse( httpResponse,  imgurls, url ); 
  239.      
  240.     //下载图片资源 
  241.     DownLoadImg( imgurls, url ); 
  242. void main() 
  243.     WSADATA wsaData; 
  244.     if( WSAStartup(MAKEWORD(2,2), &wsaData) != 0 ){ 
  245.         return
  246.     } 
  247.     CreateDirectory( "./img",0); 
  248.     CreateDirectory("./html",0); 
  249.     //string urlStart = "http://hao.360.cn/meinvdaohang.html"; 
  250.     string urlStart = "http://hao.360.cn/meinvdaohang.html"
  251.     BFS( urlStart ); 
  252.     visitedUrl.insert( urlStart ); 
  253.     while( hrefUrl.size()!=0 ){ 
  254.         string url = hrefUrl.front(); 
  255.         cout << url << endl; 
  256.         BFS( url ); 
  257.         hrefUrl.pop(); 
  258.     } 
  259.     WSACleanup(); 
  260.     return

 

0 0
原创粉丝点击