通过代理服务器下载网页

来源:互联网 发布:网络剧男演员 编辑:程序博客网 时间:2024/05/18 09:15
 

通过代理服务器下载网页

这几天研究通过代理服务器下载网页的相关东西,发现从网上根本找不到相关的代码,最后还是把wget的源码拿来研究了一下,才算明白了一点。

具体步骤如下:

1.   连接到代理服务器(以非阻塞方式发起连接,连接是否完成由下一步通过 epoll 等待)。

 // 建立到服务端的连接

int my_connect( uint32_t *pServerAddress , unsigned short ServerPort, const char * pHost, int nTimeout,

       const char* pLocalAddress, unsigned short sLocalPort,URLS *pUrl )

{

 

    int nTry = 0;

    char    strConn[255];

   

    int nSock = socket(AF_INET, SOCK_STREAM, 0);

    if( nSock < 0 )

        return -1;

   

    // 配置服务地址

    struct sockaddr_in serverAddr, clientAddr;

    char* ip = (char*)pServerAddress;

    int nReuse = 1;                                                                                                                       

   

    memset(&serverAddr, 0, sizeof(serverAddr));

    serverAddr.sin_family = AF_INET;

    if( !pServerAddress  ) // 未指定地址

        goto fail;

   

    serverAddr.sin_addr.s_addr = *pServerAddress;

   

    serverAddr.sin_port = htons(ServerPort);

   

    if( !setNonBlocking(nSock,true) )

        goto fail;

   

    while( ::connect(nSock, (sockaddr*)&serverAddr, sizeof(serverAddr)) == -1 )

    {

        if( errno == EADDRNOTAVAIL || errno == ENOTCONN )

        {

            //fprintf(stderr,"sock try %d\n",errno);

            //usleep(10000);

            nTry++;

            if( nTry<1 )

                continue;

        }

        if( errno != EINPROGRESS )

            goto fail;

        break;

        goto fail;

    }

   

   

    return nSock;

   

fail:

    printf("connect error %d %s \n",errno,strerror(errno));

    close(nSock);

    return -1;  

}

2.    发送 GET 请求并接收数据。注意:通过代理访问时,请求行中必须写完整的绝对 URL(而不是仅写路径),格式如下:

GET http://3g.sina.com.cn/ HTTP/1.0

User-Agent: Wget/1.11.1

Accept: */*

Host: 3g.sina.com.cn

 

 

代码如下:

// Running index used to name the next dump file ("<FileNum>.html") that
// iGetData writes when PRINT_OUT is defined.
int FileNum = 0;

int iGetData(int sock,URLS *pUrltest)

{

   

    epoll_event* events = new epoll_event[2]; 

    int epfd = epoll_create(2);

 

    struct epoll_event ev;

    memset(&ev, 0, sizeof(ev));

    ev.data.ptr = (void*)sock;

    ev.events = EPOLLOUT;

    if( epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) == -1 )

    {

        delete []events;

              return 1;

       }

    while( true )

    {

           // wait each connect for ever

           int nFds = epoll_wait(epfd, events, 2, MAX_TIME_OUT);

           int i;

          

 

        if(nFds <= 0)

        {

            delete []events;

            return -1;

        }

      

 

           for( i=0; i<nFds; i++ )

           {

             int nEventSock = (int)(long)events[i].data.ptr;            

             //fprintf(stderr,"event %x\n",events[i].events);

           }

           // connect ok

           setNonBlocking(sock,false);

           char* buf = new char[1024*1024];

           char* outbuf = new char[1024*1024*10];

           char* ua = getenv("USERAGENT");

           if( !ua || strlen(ua)==0 )

                  ua = "";

           char* acp = getenv("ACCEPT");

           if( !acp || strlen(acp)==0 )

                  acp = "*/*";

           char* ckie = getenv("COOKIE");

           if( !ckie || strlen(ckie)==0 )

                  ckie = "";

           char* p = buf;

           if( pUrltest->port != 80 )

           {

                  p += sprintf(buf,"GET %s HTTP/1.0\r\n"

            ,pUrltest->m_url.c_str());

           }

           else

           {

                  p += sprintf(buf,"GET %s HTTP/1.0\r\n"

             ,pUrltest->m_url.c_str());

           }

            p+= sprintf(p,"User-Agent: %s\r\n","Wget/1.11.1");

 

 

              if( ckie[0] )

                     p += sprintf(p,"Cookie: %s\r\n",ckie);

              p += sprintf(p,"Accept: */*\r\n");

        p += sprintf(p,"Host: %s\r\n\r\n",pUrltest->host.c_str());

 

 

       

           //fprintf(stderr,"\n%s\n",buf);

        //printf("%s\n",buf);

       

           if( writeBlock1(sock,buf,strlen(buf)) != strlen(buf) )

           {

                  fprintf(stderr,"write failed %d\n",errno);

            delete []events;

            delete []buf;

            delete []outbuf;

                  return 1;

           }

           int n = readBlock(sock,buf,1024*1023);

#if 1

           if( n > 0 )

           {

                     buf[n] = '\0';

                  char* ph = strstr(buf,"\r\n\r\n");

                  while( ph )

                  {

                         ph += 4;

                         std::string head = std::string(buf,ph-buf);

                            char* p = strstr(head.c_str(),"Transfer-Encoding:");

                            if( p )

                            {

                                   int size = 0;

                                   char* pb = ph;

                                   int nChunkLen = -1;

                                   char* p1 = pb;

                                   while( pb-buf < n )

                                   {

                                          // chunk begin

                                          SKIP_BLANK(p1);

                                          char* p0 = strchr(p1,'\n');

                                          if( !p0 )

                                                 break;

                                          char* p2 = p1;

                                          FIND_BLANK(p2);

                                          nChunkLen = hexdec(p1,p2-p1);

                                          p2 = p0+1; // chunk head

                                          if( nChunkLen < 0 || nChunkLen == 0 )

                                                 break;

                                          if( nChunkLen > (int)n-(p2-buf) )

                                                 nChunkLen = n-(p2-buf);

                                          p1 = p2+nChunkLen; // chunk tvend

                                          memmove(pb,p2,nChunkLen);

                                          size += nChunkLen;

                                          p0 = strchr(p1,'\n');

                                          if( !p0 )

                                                 break;

                                          pb = ph+size;

                                          p1 = p0+1;

                                   }

                                   n = size+(ph-buf);

                            }

                            p = strstr(head.c_str(),"Content-Encoding:");

                            if( p )

                            {

                                   p += 17;

                                   while(*p&&*p==' ')p++;

                                   if( !*p )

                                          break;

                                   char* p1 = p;

                                   while(*p&&*p!='\r')p++;

                                   std::string type = std::string(p1,p-p1);

                                   CHttpDecompress decomp;

                                   if( decomp.setType(type.c_str()) != HTTP_DECOMPRESS_OK )

                                          break;

                                   CompressStruct cs;

                                   memset(&cs,0,sizeof(cs));

                                   cs.m_inBuf = (Bytef*)ph;

                                   cs.m_inLen = n-(ph-buf);

                                   char* obuf = new char[1024*1024*9+head.size()];

                                   cs.m_outBuf = (Bytef*)obuf+head.size();

                                   cs.m_outLen = 1024*1024*9;     

                                   if( decomp.decompress(cs) < 0 )

                                   {

                                          delete[] obuf;

                                          break;

                                   }

                                   memcpy(obuf,head.c_str(),head.size());

                                   delete[] buf;

                                   buf = obuf;

                                   n = cs.m_routLen+head.size();

                            }

                            break;

                     }

          

           }

 

#endif           

#ifdef PRINT_OUT         

          char acFileName[1024];

          sprintf(acFileName,"%d.html",FileNum);

          FileNum++;

          FILE *pf = fopen(acFileName,"a+");

         

          fwrite(buf,strlen(buf),1,pf);

          fclose(pf);

#endif

            delete []buf;

            delete []outbuf;

           break;

    }

    close(epfd);

           close(sock);

    delete[] events;

       return 0;

}

原创粉丝点击