设计自动获取网页和提交表单组件

来源:互联网 发布:笔记本mac地址修改器 编辑:程序博客网 时间:2024/04/30 12:43

    一次做列车时刻表时想到从网页上抓取数据,设计了这个类。

///网页获取器
class HtmlGetter
{
public:
 HtmlGetter();
 HtmlGetter::HtmlGetter(CString name) ;
 CString get_one_page(std::string url);
 ~HtmlGetter();
 CString get_page_by_post(std::vector<CString> inputs, CString pre_url);
 CString get_page_by_get(std::vector<CString> inputs, CString pre_url);
 void get_file(std::string url, CString SaveToFile);
 CString get_last_header();
 void Close();

private:
 CString LastHeader;
 CInternetSession session;
};

使用方法

  HtmlGetter htmlgetter;

  CString Page1=htmlgetter.get_one_page("http://163.com");

  //设置好网页form的inputs
  vector<CString> cmds;
  cmds.push_back("pawsoperator");//input name
  cmds.push_back("1");//input value

……


   //post,
  htmlgetter.get_page_by_post(cmds,"http://......");

实现如下:


#include "stdafx.h"


#include "string"
#include "vector"
#include <iostream>
#include <fstream>
#include "./htmlgetter.h"
using namespace std;

//char HtmlGetter::SessSerial=0;
//CString HtmlGetter::SessName;
/// Default constructor: builds the internet session with the fixed name "session_".
HtmlGetter::HtmlGetter()
 : session("session_")
{
 // Nothing else to set up; LastHeader starts out empty.
}
/// Constructor taking the name that is forwarded to the internet session.
HtmlGetter::HtmlGetter(CString name)
 : session(name)
{
 // Nothing else to set up; LastHeader starts out empty.
}

/// Destructor: makes sure the internet session is closed.
HtmlGetter::~HtmlGetter()
{
 this->session.Close();
}
void HtmlGetter::Close()
{
 session.Close();
}

/// Returns a copy of the most recently stored response headers.
CString HtmlGetter::get_last_header()
{
 return this->LastHeader;
}
/// Fetches a web page.
///
/// Opens `url` through the shared session, stores the raw response headers
/// (when the resource is an HTTP response and the header query succeeds) so
/// that get_last_header() can return them, and reads the body as text.
///
/// @param url  URL passed to CInternetSession::OpenURL.
/// @return The downloaded body. When the URL cannot be opened the literal
///         string "OpenURL exception. " is returned instead.
/// @note Exceptions raised while querying headers or reading the body are
///       rethrown to the caller after the file object has been released.
CString HtmlGetter::get_one_page(std::string url)
{
 CStdioFile* pFile = NULL;
 try
 {
  pFile = session.OpenURL(url.c_str());
 }
 catch (CInternetException* e)
 {
  // MFC exceptions are heap-allocated; the handler that swallows one
  // has to release it, otherwise the object leaks.
  e->Delete();
  logger->warn("OpenURL exception");

  return CString("OpenURL exception. ");
 }

 // OpenURL signals an unparsable URL by returning NULL instead of throwing.
 if (pFile == NULL)
 {
  logger->warn("OpenURL exception");

  return CString("OpenURL exception. ");
 }

 CString Ret;
 try
 {
  // OpenURL may hand back non-HTTP file objects (FTP, gopher, local file);
  // only a real CHttpFile can be asked for response headers.
  CHttpFile* pHttpFile = DYNAMIC_DOWNCAST(CHttpFile, pFile);
  CString header1;
  if (pHttpFile != NULL &&
   pHttpFile->QueryInfo(HTTP_QUERY_RAW_HEADERS_CRLF, header1))
  {
   logger->debugStream() << "http header:" << header1;
   LastHeader = header1;
  }

  // Read chunk by chunk, keeping one byte free for the terminator that is
  // needed to append each chunk as a C string.
  char szBuff[1024];
  UINT nRead = pFile->Read(szBuff, sizeof(szBuff) - 1);
  while (nRead > 0)
  {
   szBuff[nRead] = '\0';
   Ret += szBuff;
   nRead = pFile->Read(szBuff, sizeof(szBuff) - 1);
  }
 }
 catch (...)
 {
  // Do not leak the file object when the transfer fails half-way.
  delete pFile;
  throw;
 }

 delete pFile;

 return Ret;
}

/**下载文件

@return 文件
@author 张浩

*/
void HtmlGetter::get_file(std::string url, CString SaveToFile)
{
 //下载文件必须用二进制打开
 std::ofstream savefile(SaveToFile, ios::binary | ios::out);
 CStdioFile* pFile = NULL;
 char szBuff[1024];

 try
 {
  pFile = session.OpenURL(url.c_str());
 }
 catch (CInternetException* )
 {
  logger->warn("OpenURL exception");

  return ;//CString("<html><body>OpenURL exception</body></html>");
 }
 memset(szBuff, 0, sizeof(szBuff));

 UINT nRead = pFile->Read(szBuff, 1023);
 while (nRead > 0)
 {
  //Ret += (szBuff);
  savefile.write(szBuff, nRead);
  memset(szBuff, 0, sizeof(szBuff));
  nRead = pFile->Read(szBuff, 1023);
 }

 delete pFile;

 //return Ret;
}

/**POST提交后返回页面

@param inputs name value 对
@param pre_url 提交至action页面
@return 页面
@author 张浩

*/

CString HtmlGetter::get_page_by_post(std::vector<CString> inputs,
 CString pre_url)
{
 //pre_url http://ffsfsdfdsfs:80/fsdfs/fsd.asp
 int pos = pre_url. Find('/'), pos2;
 pos2 = pos;
 if (pre_url.Find('/', pos + 1) == pos + 1)
  pos2 = pre_url.Find('/', pos + 2);
 CString server = (pos == pos2) ?
  (pre_url.Left(pos)) :
  (pre_url.Mid(pos + 2, pos2 - pos - 2));
 //server ffsfsdfdsfs:80

 CString actionform = pre_url.Mid(pos2);
 //actionform /fsdfs/fsd.asp

 CString Port = "80";
 if (server.Find(':') != -1)
 {
  Port = server.Mid(server.Find(':') + 1);
  server = server.Left(server.Find(':'));
 }
 INTERNET_PORT port = atoi(Port);


 logger->debugStream() << "actionform " << actionform;
 CString strHeaders = _T("Content-Type: application/x-www-form-urlencoded");
 // URL-encoded form variables -
 // name = "John Doe", userid = "hithere", other = "P&Q"


 //url加密
 CString params;
 if ((inputs.size() % 2) != 0)//不是2的倍数,说明解析错?
 {
  logger->error("parse post inputs error ");//of << "parse cmd error" << endl;
  return CString("<html><body>parse post inputs error. ");/*I will retry " +
                  GetConfString("general", "retry") +
                  " times.<br><br><br>PMKPkZ1AFf08CkMUPgvMuJiw</body></html>");*/
 }

 for (vector<CString>::iterator i = inputs.begin();
  i != inputs.end();
  ++i,++i)
 {
  if (i != inputs.begin())
   params += '&';
  params += UrlEncode(i->GetBuffer(i->GetLength()));
  logger->debugStream() << "UrlEncode param1:"
   << i->GetBuffer(i->GetLength()) << " to "
   << UrlEncode(i->GetBuffer(i->GetLength()));
  params += '=';
  vector<CString>::iterator j = i;++j;
  params += UrlEncode(j->GetBuffer(j->GetLength()));
  logger->debugStream() << "UrlEncode param2:"
   << j->GetBuffer(j->GetLength()) << " to "
   << UrlEncode(j->GetBuffer(j->GetLength()));
 }

 CString strFormData = params;//_T(params.GetBuffer(params.GetLength())/* "search_txt=xmlspy"*/);

 logger->debugStream() << "strFormData: " << strFormData;

 CString Ret;

 try
 {
  //CInternetSession session;
  CHttpConnection* pConnection = session.GetHttpConnection(server/*_T("ServerNameHere")*/,
            port);
  CHttpFile* pFile = pConnection->OpenRequest(CHttpConnection::HTTP_VERB_POST,
           actionform/*_T("FormActionHere")*/);
  BOOL result = pFile->SendRequest(strHeaders,
        (LPVOID) (LPCTSTR) strFormData,
        strFormData.GetLength());

  char szBuff[1024];
  memset(szBuff, 0, sizeof(szBuff));

  UINT nRead = pFile->Read(szBuff, 1023);
  while (nRead > 0)
  {
   Ret += (szBuff);
   memset(szBuff, 0, sizeof(szBuff));
   nRead = pFile->Read(szBuff, 1023);
  }

  delete pFile;
 }
 catch (CInternetException* )
 {
  logger->warn("get_page_by_post exception");

  return CString("<html><body>get_page_by_post exception .");/* maybe url is not exist. I will retry " +
                      GetConfString("general", "retry") +
                      " times.<br><br><br>PMKPkZ1AFf08CkMUPgvMuJiw</body></html>");*/
 }


 return Ret;
}

/**Submits a form by GET and returns the resulting page.

Method: the fields are URL-encoded into a query string, the query string is
appended to `pre_url`, and the combined URL is fetched with get_one_page().

@param inputs  flat list of alternating field names and values
               (name1, value1, name2, value2, ...); must have even size
@param pre_url URL of the form's action page
@return the page returned by get_one_page(); an explanatory
        "<html><body>..." string when `inputs` has odd size
@author Zhang Hao

*/

CString HtmlGetter::get_page_by_get(std::vector<CString> inputs,
 CString pre_url)
{
 if ((inputs.size() % 2) != 0)// an odd count means a name without a value
 {
  logger->error("parse get inputs error ");
  return CString("<html><body>parse get inputs error.");
 }

 // Build the URL-encoded query: name=value&name=value...
 CString params;
 for (std::vector<CString>::size_type k = 0; k + 1 < inputs.size(); k += 2)
 {
  if (k != 0)
   params += '&';

  // Encode each item once and reuse the result for query and log.
  CString encodedName = UrlEncode(inputs[k].GetBuffer(inputs[k].GetLength()));
  params += encodedName;
  logger->debugStream() << "UrlEncode param1:"
   << inputs[k] << " to "
   << encodedName;

  params += '=';

  CString encodedValue = UrlEncode(inputs[k + 1].GetBuffer(inputs[k + 1].GetLength()));
  params += encodedValue;
  logger->debugStream() << "UrlEncode param2:"
   << inputs[k + 1] << " to "
   << encodedValue;
 }

 // Start a query string with '?', or extend an existing one with '&'.
 CString fullUrl = pre_url;
 fullUrl += (pre_url.Find('?') == -1) ? '?' : '&';
 fullUrl += params;

 // Direct-initialise from the character pointer: copy-initialising a
 // std::string from a CString would need two user-defined conversions.
 std::string newurl((LPCTSTR) fullUrl);

 logger->debugStream() << "newurl: " << newurl;

 return this->get_one_page(newurl);
}


原创粉丝点击