考虑模拟浏览器的爬虫设计
来源:互联网 发布:电脑数据恢复大师免费 编辑:程序博客网 时间:2024/06/06 02:56
使用 HttpClient 3 自动登录淘宝、开心网
需要用到的java包
commons-httpclient-3.1.jar
commons-logging.jar
log4j-1.2.15.jar
commons-codec.jar
登录淘宝的例子:
- package com.spider.cron;
- import java.io.IOException;
- import org.apache.commons.httpclient.Cookie;
- import org.apache.commons.httpclient.Header;
- import org.apache.commons.httpclient.HttpClient;
- import org.apache.commons.httpclient.HttpException;
- import org.apache.commons.httpclient.NameValuePair;
- import org.apache.commons.httpclient.methods.GetMethod;
- import org.apache.commons.httpclient.methods.PostMethod;
- public class TaoBaoLogin {
- private static final String LOGON_SITE = "http://www.taobao.com";
- private static final int LOGON_PORT = 80;
- private static final String TAOBAO_BASE_LOGIN_BEFORE = "http://member1.taobao.com/member/login.jhtml?f=top&redirectURL=http%3A%2F%2Fwww.taobao.com%2F";
- private static final String TAOBAO_BASE_LOGIN = "http://login.taobao.com/member/login.jhtml";
- public static void main(String args[]) throws HttpException, IOException {
- String taobaoUser="woshigoojje@163.com";
- String taobaoPwd="3DES_2_000000000000000000000000000000_61F0B8BE021BBBDD020919017B6816F5";
- String taobaoTid="XOR_1_000000000000000000000000000000_63584054400B0F717B750370";
- HttpClient client = new HttpClient();
- client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT);
- String _tb_token_Value="";
- Cookie[] cookies = client.getState().getCookies();
- String responseString = processGet(client,null,TAOBAO_BASE_LOGIN_BEFORE,cookies,true,true);
- responseString=responseString.substring(responseString.indexOf("_tb_token_")+"_tb_token_".length());
- responseString=responseString.substring(responseString.indexOf("_tb_token_")+"_tb_token_".length());
- _tb_token_Value=responseString.substring(responseString.indexOf("value=")+"value='".length(),responseString.indexOf(">")-1);
- PostMethod post = new PostMethod(TAOBAO_BASE_LOGIN);
- NameValuePair[] params= new NameValuePair[] {
- new NameValuePair("_oooo_", ""),
- new NameValuePair("_tb_token_", _tb_token_Value),
- new NameValuePair("abtest", ""),
- new NameValuePair("action", "Authenticator"),
- new NameValuePair("actionForStable", "enable_post_user_action"),
- new NameValuePair("CtrlVersion", "1,0,0,7"),
- new NameValuePair("done", ""),
- new NameValuePair("event_submit_do_login", "anything"),
- new NameValuePair("from", ""),
- new NameValuePair("loginType", "4"),
- new NameValuePair("mcheck", ""),
- new NameValuePair("mi_uid", ""),
- new NameValuePair("pstrong", ""),
- new NameValuePair("support", "000001"),
- new NameValuePair("tid", taobaoTid),
- new NameValuePair("TPL_password", taobaoPwd),
- new NameValuePair("TPL_redirect_url", ""),
- new NameValuePair("TPL_redirect_url", ""),
- new NameValuePair("TPL_redirect_url", ""),
- new NameValuePair("TPL_redirect_url", ""),
- new NameValuePair("TPL_redirect_url", ""),
- new NameValuePair("TPL_redirect_url", ""),
- new NameValuePair("TPL_username", taobaoUser),
- new NameValuePair("yparam", "")
- };
- processPost(client, post, TAOBAO_BASE_LOGIN, params, cookies, true, false);
- Header header=post.getResponseHeader("Location");
- String redirectUrl=header.getValue();
- responseString = processGet(client, null, redirectUrl, cookies, true, true);
- responseString=responseString.substring(0,responseString.indexOf("我的彩票"));
- System.out.println("main(String[]) - " + responseString);
- String caiPiaoUrl=responseString.substring(responseString.lastIndexOf("<a")+"<a href='".length(),responseString.lastIndexOf(">")-1);
- processGet(client, null, caiPiaoUrl, cookies, true, false);
- }
- public static String processGet(HttpClient client,GetMethod get,String url,Cookie[] cookies,boolean needAppendCookies,boolean needResponse) throws IOException{
- if(client==null || url==null || url=="") return "";
- if(get==null)
- get=new GetMethod();
- get = new GetMethod(url);
- if(cookies!=null)
- get.setRequestHeader("Cookie" , cookies.toString());
- client.executeMethod(get);
- if(needAppendCookies){
- cookies = client.getState().getCookies();
- client.getState().addCookies(cookies);
- }
- if(needResponse)
- return get.getResponseBodyAsString();
- get.releaseConnection();
- return "";
- }
- public static String processPost(HttpClient client,PostMethod post,String url,NameValuePair[] params,Cookie[] cookies,boolean needAppendCookies,boolean needResponse) throws IOException{
- if(client==null || url==null || url=="") return "";
- if(post==null)
- post = new PostMethod(url);
- if(params!=null && params.length>0)
- post.setRequestBody(params);
- if(cookies!=null)
- post.setRequestHeader("Cookie" , cookies.toString());
- client.executeMethod(post);
- if(needAppendCookies){
- cookies = client.getState().getCookies();
- client.getState().addCookies(cookies);
- }
- if(needResponse)
- return post.getResponseBodyAsString();
- post.releaseConnection();
- return "";
- }
- public static String processDetail(String tempStr,String startFlag,String endFlag,int starts,int ends){
- if(tempStr==null || "".equals(tempStr)) return "";
- int start=tempStr.indexOf(startFlag);
- int end=tempStr.indexOf(endFlag);
- if(start==-1 || end==-1 || (end-ends)<(start+starts)) return "";
- try{
- tempStr=tempStr.substring(start+starts,end-ends);
- }catch(Exception e){
- System.out.println("processDetail(String, String, String, int, int) " + e.toString());
- return "";
- }
- return tempStr;
- }
- }
登录开心网的例子:
- package com.spider.cron;
- import org.apache.commons.httpclient.Cookie;
- import org.apache.commons.httpclient.HttpClient;
- import org.apache.commons.httpclient.NameValuePair;
- import org.apache.commons.httpclient.methods.GetMethod;
- import org.apache.commons.httpclient.methods.PostMethod;
- public class LoginKaixin {
- private static final String LOGON_SITE = "http://www.kaixin001.com";
- private static final int LOGON_PORT = 80;
- public static void main(String[] args) throws Exception {
- HttpClient client = new HttpClient();
- client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT);
- // 登录页面
- PostMethod post = new PostMethod(
- "http://www.kaixin001.com/login/login.php");
- NameValuePair ie = new NameValuePair("User-Agent",
- "Mozilla/4.0 (compatible; MSIE 6.0; Windows 2000)");
- NameValuePair url = new NameValuePair("url", "/home/");
- NameValuePair username = new NameValuePair("email", "xxx@163.com");
- NameValuePair password = new NameValuePair("password", "xxxxxx");
- post.setRequestBody(new NameValuePair[] { ie, url, username, password });
- client.executeMethod(post);
- System.out
- .println("******************************登录******************************");
- Cookie[] cookies = client.getState().getCookies();
- client.getState().addCookies(cookies);
- post.releaseConnection();
- System.out
- .println("******************************页面转向******************************");
- String newUrl = "http://www.kaixin001.com/home/";
- System.out.println("==========Cookies============");
- int i = 0;
- for (Cookie c : cookies) {
- System.out.println(++i + ": " + c);
- }
- client.getState().addCookies(cookies);
- post.releaseConnection();
- GetMethod get = new GetMethod(newUrl);
- get.setRequestHeader("Cookie", cookies.toString());
- client.executeMethod(get);
- String responseString = get.getResponseBodyAsString();
- // 登录后首页的内容
- System.out.println(responseString);
- get.releaseConnection();
- System.out
- .println("******************************组件功能******************************");
- // "http://www.kaixin001.com/!slave/index.php", "朋友买卖"
- // "http://www.kaixin001.com/!parking/index.php", "争车位"
- // "http://www.kaixin001.com/!house/index.php?_lgmode=pri", "买房子"
- // http://www.kaixin001.com/!house/index.php?_lgmode=pri&t=49
- // "http://www.kaixin001.com/!house/garden/index.php","花园"
- // (1)进入朋友买卖****************
- System.out
- .println("******************************(1)进入朋友买卖******************************");
- String slave = "http://www.kaixin001.com/!slave/index.php";
- get = new GetMethod(slave);
- get.setRequestHeader("Cookie", cookies.toString());
- client.executeMethod(get);
- responseString = get.getResponseBodyAsString();
- System.out.println(responseString);
- get.releaseConnection();
- // (2)进入争车位****************
- System.out
- .println("******************************(2)进入争车位******************************");
- String parking = "http://www.kaixin001.com/!parking/index.php";
- get = new GetMethod(parking);
- get.setRequestHeader("Cookie", cookies.toString());
- client.executeMethod(get);
- responseString = get.getResponseBodyAsString();
- System.out.println(responseString);
- get.releaseConnection();
- // (3)进入买房子****************
- System.out
- .println("******************************(3)进入买房子*******************************");
- String house = "http://www.kaixin001.com/!house/index.php?_lgmode=pri&t=49";
- get = new GetMethod(house);
- get.setRequestHeader("Cookie", cookies.toString());
- client.executeMethod(get);
- responseString = get.getResponseBodyAsString();
- System.out.println(responseString);
- get.releaseConnection();
- // (4)进入花园****************
- System.out
- .println("******************************(4)进入花园*******************************");
- String garden = "http://www.kaixin001.com/!house/garden/index.php";
- get = new GetMethod(garden);
- get.setRequestHeader("Cookie", cookies.toString());
- client.executeMethod(get);
- responseString = get.getResponseBodyAsString();
- System.out.println(responseString);
- get.releaseConnection();
- }
- }
- 考虑模拟浏览器的爬虫设计
- 02爬虫---浏览器的模拟Headers属性
- Python爬虫002浏览器的模拟Header属性
- 模拟用户行为高匿爬虫的设计开发
- Scrapy爬虫:模拟浏览器和使用代理
- php爬虫模拟浏览器,ini_set()用法
- python爬虫2之简单模拟浏览器
- python urllib2模拟浏览器请求 爬虫
- 爬虫的浏览器伪装
- Python 爬虫之 selenium 爬虫,模拟浏览器爬取天猫信息
- python 针对selenium+phontomjs等模拟浏览器爬虫的反爬技术点
- 数字电路设计的抗干扰考虑
- 网络设计的综合考虑
- 网络设计的综合考虑
- 设计阶段的考虑问题
- 数字电路设计的抗干扰考虑
- 内存设计考虑的要点
- 存储的分类设计考虑
- 获取提升类型和判断进程是否以管理员身份运行
- Lucene实时索引构建
- linux_command_sort
- asp asp.net domino 设计开发爱好者
- Oracle SQL Trace 和 10046 事件
- 考虑模拟浏览器的爬虫设计
- 自动化软件测试——六点小贴士
- 高效利用时间的5个小方法
- iTeXmacs如此之慢
- Nutch+HBase
- 网站静态化方案
- ORACLE 数据集合
- 校外实训周报(十)
- EIP & EBP & ESP