考虑模拟浏览器的爬虫设计

来源:互联网 发布:电脑数据恢复大师免费 编辑:程序博客网 时间:2024/06/06 02:56

使用 HttpClient 3 自动登录淘宝、开心网


需要用到的 Java 包(jar):
commons-httpclient-3.1.jar 
commons-logging.jar 
log4j-1.2.15.jar 
commons-codec.jar

登录淘宝的例子:

Java代码 
  1. package com.spider.cron;  
  2.   
  3.   
  4. import java.io.IOException;  
  5.   
  6. import org.apache.commons.httpclient.Cookie;  
  7. import org.apache.commons.httpclient.Header;  
  8. import org.apache.commons.httpclient.HttpClient;  
  9. import org.apache.commons.httpclient.HttpException;  
  10. import org.apache.commons.httpclient.NameValuePair;  
  11. import org.apache.commons.httpclient.methods.GetMethod;  
  12. import org.apache.commons.httpclient.methods.PostMethod;  
  13.   
  14.   
  15. public class TaoBaoLogin {  
  16.   
  17.         private static final String LOGON_SITE = "http://www.taobao.com";  
  18.         private static final int LOGON_PORT = 80;  
  19.       
  20.         private static final String TAOBAO_BASE_LOGIN_BEFORE = "http://member1.taobao.com/member/login.jhtml?f=top&redirectURL=http%3A%2F%2Fwww.taobao.com%2F";  
  21.         private static final String TAOBAO_BASE_LOGIN = "http://login.taobao.com/member/login.jhtml";  
  22.   
  23.         public static void main(String args[]) throws HttpException, IOException {  
  24.             String taobaoUser="woshigoojje@163.com";  
  25.               
  26.             String taobaoPwd="3DES_2_000000000000000000000000000000_61F0B8BE021BBBDD020919017B6816F5";  
  27.             String taobaoTid="XOR_1_000000000000000000000000000000_63584054400B0F717B750370";  
  28.               
  29.             HttpClient client = new HttpClient();  
  30.             client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT);  
  31.               
  32.           
  33.             String _tb_token_Value="";  
  34.             Cookie[] cookies = client.getState().getCookies();          
  35.             String responseString = processGet(client,null,TAOBAO_BASE_LOGIN_BEFORE,cookies,true,true);     
  36.   
  37.            
  38.             responseString=responseString.substring(responseString.indexOf("_tb_token_")+"_tb_token_".length());  
  39.             responseString=responseString.substring(responseString.indexOf("_tb_token_")+"_tb_token_".length());  
  40.             _tb_token_Value=responseString.substring(responseString.indexOf("value=")+"value='".length(),responseString.indexOf(">")-1);  
  41.              
  42.             PostMethod post = new PostMethod(TAOBAO_BASE_LOGIN);  
  43.             NameValuePair[] params= new NameValuePair[] {                 
  44.             new NameValuePair("_oooo_"""),  
  45.             new NameValuePair("_tb_token_", _tb_token_Value),  
  46.             new NameValuePair("abtest"""),  
  47.             new NameValuePair("action""Authenticator"),  
  48.             new NameValuePair("actionForStable""enable_post_user_action"),  
  49.             new NameValuePair("CtrlVersion""1,0,0,7"),  
  50.             new NameValuePair("done"""),  
  51.             new NameValuePair("event_submit_do_login""anything"),  
  52.             new NameValuePair("from"""),  
  53.             new NameValuePair("loginType""4"),  
  54.             new NameValuePair("mcheck"""),  
  55.             new NameValuePair("mi_uid"""),  
  56.             new NameValuePair("pstrong"""),  
  57.             new NameValuePair("support""000001"),  
  58.             new NameValuePair("tid", taobaoTid),  
  59.             new NameValuePair("TPL_password", taobaoPwd),  
  60.             new NameValuePair("TPL_redirect_url"""),  
  61.             new NameValuePair("TPL_redirect_url"""),  
  62.             new NameValuePair("TPL_redirect_url"""),  
  63.             new NameValuePair("TPL_redirect_url"""),  
  64.             new NameValuePair("TPL_redirect_url"""),  
  65.             new NameValuePair("TPL_redirect_url"""),  
  66.             new NameValuePair("TPL_username", taobaoUser),  
  67.             new NameValuePair("yparam""")  
  68.             };  
  69.             processPost(client, post, TAOBAO_BASE_LOGIN, params, cookies, truefalse);  
  70.             Header header=post.getResponseHeader("Location");  
  71.             String redirectUrl=header.getValue();  
  72.               
  73.               
  74.             responseString = processGet(client, null, redirectUrl, cookies, truetrue);  
  75.               
  76.               
  77.             responseString=responseString.substring(0,responseString.indexOf("我的彩票"));  
  78.             System.out.println("main(String[]) - " + responseString);   
  79.   
  80.             String caiPiaoUrl=responseString.substring(responseString.lastIndexOf("<a")+"<a href='".length(),responseString.lastIndexOf(">")-1);  
  81.       
  82.             processGet(client, null, caiPiaoUrl, cookies, truefalse);  
  83.   
  84.               
  85.               
  86.         }  
  87.           
  88.           
  89.   
  90.         public static String processGet(HttpClient client,GetMethod get,String url,Cookie[] cookies,boolean needAppendCookies,boolean needResponse) throws IOException{  
  91.             if(client==null || url==null || url==""return "";  
  92.             if(get==null)  
  93.                 get=new GetMethod();  
  94.             get = new  GetMethod(url);    
  95.             if(cookies!=null)  
  96.                 get.setRequestHeader("Cookie" , cookies.toString());  
  97.             client.executeMethod(get);  
  98.             if(needAppendCookies){  
  99.                 cookies = client.getState().getCookies();     
  100.                 client.getState().addCookies(cookies);     
  101.             }  
  102.             if(needResponse)  
  103.                 return get.getResponseBodyAsString();     
  104.             get.releaseConnection();  
  105.             return "";  
  106.         }  
  107.           
  108.   
  109.         public static String processPost(HttpClient client,PostMethod post,String url,NameValuePair[] params,Cookie[] cookies,boolean needAppendCookies,boolean needResponse) throws IOException{  
  110.             if(client==null || url==null || url==""return "";  
  111.             if(post==null)  
  112.                 post = new PostMethod(url);  
  113.               
  114.             if(params!=null && params.length>0)  
  115.                 post.setRequestBody(params);  
  116.             if(cookies!=null)  
  117.                 post.setRequestHeader("Cookie" , cookies.toString());  
  118.             client.executeMethod(post);  
  119.             if(needAppendCookies){  
  120.                 cookies = client.getState().getCookies();     
  121.                 client.getState().addCookies(cookies);     
  122.             }  
  123.             if(needResponse)  
  124.                 return post.getResponseBodyAsString();  
  125.             post.releaseConnection();  
  126.             return "";  
  127.         }  
  128.           
  129.   
  130.         public static String processDetail(String tempStr,String startFlag,String endFlag,int starts,int ends){  
  131.             if(tempStr==null || "".equals(tempStr)) return "";  
  132.             int start=tempStr.indexOf(startFlag);  
  133.             int end=tempStr.indexOf(endFlag);  
  134.             if(start==-1 || end==-1 || (end-ends)<(start+starts)) return "";  
  135.             try{  
  136.             tempStr=tempStr.substring(start+starts,end-ends);  
  137.             }catch(Exception e){  
  138.                 System.out.println("processDetail(String, String, String, int, int) " + e.toString());   
  139.                 return "";  
  140.             }  
  141.             return tempStr;  
  142.         }  
  143.   
  144. }  

 

 

 

登录开心网的例子:

 

Java代码 
  1. package com.spider.cron;  
  2.   
  3. import org.apache.commons.httpclient.Cookie;  
  4. import org.apache.commons.httpclient.HttpClient;  
  5. import org.apache.commons.httpclient.NameValuePair;  
  6. import org.apache.commons.httpclient.methods.GetMethod;  
  7. import org.apache.commons.httpclient.methods.PostMethod;  
  8.   
  9. public class LoginKaixin {  
  10.     private static final String LOGON_SITE = "http://www.kaixin001.com";  
  11.   
  12.     private static final int LOGON_PORT = 80;  
  13.   
  14.     public static void main(String[] args) throws Exception {  
  15.         HttpClient client = new HttpClient();  
  16.         client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT);  
  17.   
  18.         // 登录页面  
  19.         PostMethod post = new PostMethod(  
  20.                 "http://www.kaixin001.com/login/login.php");  
  21.         NameValuePair ie = new NameValuePair("User-Agent",  
  22.                 "Mozilla/4.0 (compatible; MSIE 6.0; Windows 2000)");  
  23.         NameValuePair url = new NameValuePair("url""/home/");  
  24.         NameValuePair username = new NameValuePair("email""xxx@163.com");  
  25.         NameValuePair password = new NameValuePair("password""xxxxxx");  
  26.         post.setRequestBody(new NameValuePair[] { ie, url, username, password });  
  27.         client.executeMethod(post);  
  28.         System.out  
  29.                 .println("******************************登录******************************");  
  30.         Cookie[] cookies = client.getState().getCookies();  
  31.         client.getState().addCookies(cookies);  
  32.         post.releaseConnection();  
  33.         System.out  
  34.                 .println("******************************页面转向******************************");  
  35.         String newUrl = "http://www.kaixin001.com/home/";  
  36.         System.out.println("==========Cookies============");  
  37.         int i = 0;  
  38.         for (Cookie c : cookies) {  
  39.             System.out.println(++i + ":   " + c);  
  40.         }  
  41.         client.getState().addCookies(cookies);  
  42.         post.releaseConnection();  
  43.         GetMethod get = new GetMethod(newUrl);  
  44.         get.setRequestHeader("Cookie", cookies.toString());  
  45.         client.executeMethod(get);  
  46.         String responseString = get.getResponseBodyAsString();  
  47.         // 登录后首页的内容  
  48.         System.out.println(responseString);  
  49.         get.releaseConnection();  
  50.         System.out  
  51.                 .println("******************************组件功能******************************");  
  52.         // "http://www.kaixin001.com/!slave/index.php", "朋友买卖"  
  53.         // "http://www.kaixin001.com/!parking/index.php", "争车位"  
  54.         // "http://www.kaixin001.com/!house/index.php?_lgmode=pri", "买房子"  
  55.         // http://www.kaixin001.com/!house/index.php?_lgmode=pri&t=49  
  56.         // "http://www.kaixin001.com/!house/garden/index.php","花园"  
  57.         // (1)进入朋友买卖****************  
  58.         System.out  
  59.                 .println("******************************(1)进入朋友买卖******************************");  
  60.         String slave = "http://www.kaixin001.com/!slave/index.php";  
  61.         get = new GetMethod(slave);  
  62.         get.setRequestHeader("Cookie", cookies.toString());  
  63.         client.executeMethod(get);  
  64.         responseString = get.getResponseBodyAsString();  
  65.         System.out.println(responseString);  
  66.         get.releaseConnection();  
  67.         // (2)进入争车位****************  
  68.         System.out  
  69.                 .println("******************************(2)进入争车位******************************");  
  70.         String parking = "http://www.kaixin001.com/!parking/index.php";  
  71.         get = new GetMethod(parking);  
  72.         get.setRequestHeader("Cookie", cookies.toString());  
  73.         client.executeMethod(get);  
  74.         responseString = get.getResponseBodyAsString();  
  75.         System.out.println(responseString);  
  76.         get.releaseConnection();  
  77.         // (3)进入买房子****************  
  78.         System.out  
  79.                 .println("******************************(3)进入买房子*******************************");  
  80.         String house = "http://www.kaixin001.com/!house/index.php?_lgmode=pri&t=49";  
  81.         get = new GetMethod(house);  
  82.         get.setRequestHeader("Cookie", cookies.toString());  
  83.         client.executeMethod(get);  
  84.         responseString = get.getResponseBodyAsString();  
  85.         System.out.println(responseString);  
  86.         get.releaseConnection();  
  87.         // (4)进入花园****************  
  88.         System.out  
  89.                 .println("******************************(4)进入花园*******************************");  
  90.         String garden = "http://www.kaixin001.com/!house/garden/index.php";  
  91.         get = new GetMethod(garden);  
  92.         get.setRequestHeader("Cookie", cookies.toString());  
  93.         client.executeMethod(get);  
  94.         responseString = get.getResponseBodyAsString();  
  95.         System.out.println(responseString);  
  96.         get.releaseConnection();  
  97.   
  98.     }  
  99.   
  100. }