Linux获取网页源码的几种方法
来源:互联网 发布:网络教育自我鉴定 编辑:程序博客网 时间:2024/05/16 01:46
第一个是利用Linux下的工具来获取网页源码,我用的是Wget,也可以使用Curl;Curl更加灵活,可以设置很多参数;
// Fetch a web page's source by shelling out to wget.
// Returns the page contents (line breaks preserved), or "" when the URL
// has no trailing file name or the downloaded file cannot be opened.
// NOTE(review): relies on an external wget binary and a writable
// current working directory.
string GetHtmlByWget(string url)
{
    // Local file name wget will create: text after the last '/'.
    // (find_last_of returning npos wraps to 0 via +1, keeping the
    // original's behavior of using the whole string.)
    string fileName = url.substr(url.find_last_of("/") + 1);
    if (fileName.empty())
    {
        return "";
    }

    // Single-quote the URL so shell metacharacters inside it are not
    // interpreted by system() (the original passed it raw -- a
    // command-injection hazard).
    string strCom = "wget -q '";   // -q: suppress wget's progress output
    strCom.append(url);
    strCom.append("'");
    system(strCom.c_str());

    string strHtml = "";
    ifstream fin(fileName.c_str());
    if (fin)
    {
        // Read the downloaded file into memory.  Re-append the '\n'
        // that getline consumes; the original silently glued all
        // lines together.
        char chTemp[1024];
        while (fin.getline(chTemp, sizeof(chTemp)))
        {
            strHtml.append(chTemp);
            strHtml.append("\n");
        }
        fin.close();
    }

    // Remove the temporary file on every path (the original leaked it
    // when the open failed), with the name quoted as above.
    strCom = "rm -f '";
    strCom.append(fileName);
    strCom.append("'");
    system(strCom.c_str());

    return strHtml;
}
第二个是用socket来获取源码:
//通过GET获取网页源码string GetHtmlByGet(string url){ string strHtmlContent = ""; int sockfd; struct sockaddr_in addr; struct hostent *pURL; char text[RECVBUF]; //分析链接 UrlInfo urlInfo = ParseURL(url); string sAccept = "Accept: */*\r\nAccept-Language: zh-cn\r\nAccept-Encoding: gzip, deflate"; //不同的主机UserAgent不同 string sUserAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10"; //将端口转换为字符串 char t[6]; string strPort; sprintf(t,"%d", urlInfo.Port); strPort = t; //构造发送字符串 string strRequest = ""; strRequest.append("GET "); strRequest.append(urlInfo.File); strRequest.append("?"); strRequest.append(urlInfo.Body); strRequest.append(" HTTP/1.1\r\n"); strRequest.append(sAccept); strRequest.append("\r\nUser-Agent:"); strRequest.append(sUserAgent); strRequest.append("\r\nHost:"); strRequest.append(urlInfo.Host); strRequest.append(":"); strRequest.append(strPort); strRequest.append("\r\nConnection: Keep-Alive\r\n\r\n"); char* host = const_cast<char*>(urlInfo.Host.c_str()); sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); //TCP方式发送 pURL = gethostbyname(host); addr.sin_family = AF_INET; addr.sin_addr.s_addr = *((unsigned long*)pURL->h_addr); addr.sin_port = htons(80); //连接 connect(sockfd,(struct sockaddr *)&addr,sizeof(addr)); //发送 send(sockfd, const_cast<char*>(strRequest.c_str()), strRequest.length(), 0); //接受 while(recv(sockfd, text, RECVBUF, 0) > 0) { strHtmlContent.append(text); bzero(text,RECVBUF); } //关闭socket close(sockfd); //返回接受结果 return strHtmlContent;}
第三个使用libcurl:
#include <stdio.h> #include <string.h> #include <curl/curl.h> #define MAX_BUF 65536 char wr_buf[MAX_BUF+1]; int wr_index; /* * Write data callback function (called within the context of * curl_easy_perform. */ size_t write_data( void *buffer, size_t size, size_t nmemb, void *userp ) { int segsize = size * nmemb; /* Check to see if this data exceeds the size of our buffer. If so, * set the user-defined context value and return 0 to indicate a * problem to curl. */ if ( wr_index + segsize > MAX_BUF ) { *(int *)userp = 1; return 0; } /* Copy the data from the curl buffer into our buffer */ memcpy( (void *)&wr_buf[wr_index], buffer, (size_t)segsize ); /* Update the write index */ wr_index += segsize; /* Null terminate the buffer */ wr_buf[wr_index] = 0; /* Return the number of bytes received, indicating to curl that all is okay */ return segsize; } /* * Simple curl application to read the index.html file from a Web site. */ int main( void ) { CURL *curl; CURLcode ret; int wr_error; wr_error = 0; wr_index = 0; /* First step, init curl */ curl = curl_easy_init(); if (!curl) { printf("couldn't init curl\n"); return 0; } /* Tell curl the URL of the file we're going to retrieve */ curl_easy_setopt( curl, CURLOPT_URL, "www.exampledomain.com" ); /* Tell curl that we'll receive data to the function write_data, and * also provide it with a context pointer for our error return. */ curl_easy_setopt( curl, CURLOPT_WRITEDATA, (void *)&wr_error ); curl_easy_setopt( curl, CURLOPT_WRITEFUNCTION, write_data ); /* Allow curl to perform the action */ ret = curl_easy_perform( curl ); printf( "ret = %d (write_error = %d)\n", ret, wr_error ); /* Emit the page if curl indicates that no errors occurred */ if ( ret == 0 ) printf( "%s\n", wr_buf ); curl_easy_cleanup( curl ); return 0; }
0 0
- Linux获取网页源码的几种方法
- Linux获取网页源码的几种方法 linux爬虫程序
- Linux获取网页源码的几种方法 linux爬虫程序
- C#获取指定网页源码的几种方法
- Linux下获取软件源码的几种方法
- 获取网页内容的几种方法
- asp.net C# 获取网页源码的几种方式
- c#获取网页源代码的几种方法
- JS获取网页中HTML元素的几种方法
- JS获取网页中HTML元素的几种方法
- c#获取网页源代码的几种方法
- JS获取网页中HTML元素的几种方法
- JS获取网页中HTML元素的几种方法
- PHP获取网页内容的几种方法
- PHP获取网页内容的几种方法
- linux获取文件大小的几种方法
- php获取网页内容几种方法
- linux获取网页源码
- 基于比较的排序算法的最优下界为什么是O(nlogn)
- git学习
- Android之rild进程启动源码分析
- 认真是我们最好的人生态度
- 配置 Sublime Text 方便查看Scala 编译器的解析结果
- Linux获取网页源码的几种方法
- 【EXCEL】update拷贝数字
- 【stagefrightplayer】4 OMX Codec介绍 (2/2)
- Android 4.4 Kit Kat 源码下载
- weblogic下面增加虚拟目录
- 跟着《算法导论》学习——快速排序
- eclipse中variable references non-existion resource可能原因及解决方案
- 深度学习:又一次推动AI梦想
- uestc 第五届ACM趣味程序设计竞赛第二场(正式赛)棋盘