HtmlUnit 爬虫简单案例——模拟登陆CSDN

来源:互联网 发布:淘宝虚拟物品手动发货 编辑:程序博客网 时间:2024/06/16 16:47

最近要弄一个爬虫程序,想着先来个简单的模拟登录。在权衡 JxBrowser 和 HtmlUnit 两种技术时发现:JxBrowser 有界面呈现效果,但是对于某些 js 跳转之后的效果,获取起来比较繁琐。

随后考虑用 HtmlUnit,想着借用咱们 CSDN 的登录练练手。谁知道 CSDN 的登录页 js 加载时间超长,如果不把等待时间设置得长一点,js 还没生效,按钮提交根本没效果。具体看代码注释吧。奉劝做爬虫的同志们,千万别用 CSDN 登录练手,坑死我了。。。

maven配置如下:

<dependencies>
    <!-- HtmlUnit: provides the WebClient / HtmlPage classes used by the login example. -->
    <!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
    <dependency>
        <groupId>net.sourceforge.htmlunit</groupId>
        <artifactId>htmlunit</artifactId>
        <version>2.18</version>
    </dependency>
    <!-- jsoup: NOTE(review): not imported by the SimulateLogin example in this article;
         presumably intended for parsing the fetched HTML afterwards - confirm before keeping. -->
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.9.2</version>
    </dependency>
</dependencies>


代码如下:

/* * Copyright (c) 2017 Create By Shijing All Rights Reserved. */package com.test;import java.io.IOException;import java.net.MalformedURLException;import java.util.HashMap;import java.util.Map;import java.util.Set;import com.gargoylesoftware.htmlunit.BrowserVersion;import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;import com.gargoylesoftware.htmlunit.SilentCssErrorHandler;import com.gargoylesoftware.htmlunit.WebClient;import com.gargoylesoftware.htmlunit.html.HtmlButtonInput;import com.gargoylesoftware.htmlunit.html.HtmlForm;import com.gargoylesoftware.htmlunit.html.HtmlPage;import com.gargoylesoftware.htmlunit.html.HtmlPasswordInput;import com.gargoylesoftware.htmlunit.html.HtmlTextInput;import com.gargoylesoftware.htmlunit.util.Cookie;public class SimulateLogin{    //访问的目标网址(CSDN)    private static String TARGET_URL = "https://passport.csdn.net/account/login?from=http://www.csdn.net";    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException    {        // 模拟一个浏览器        WebClient webClient = new WebClient(BrowserVersion.CHROME);        // 设置webClient的相关参数        webClient.setCssErrorHandler(new SilentCssErrorHandler());          //设置ajax        webClient.setAjaxController(new NicelyResynchronizingAjaxController());        //设置支持js        webClient.getOptions().setJavaScriptEnabled(true);        //CSS渲染禁止        webClient.getOptions().setCssEnabled(false);        //超时时间        webClient.getOptions().setTimeout(50000);        //设置js抛出异常:false        webClient.getOptions().setThrowExceptionOnScriptError(false);        //允许重定向        webClient.getOptions().setRedirectEnabled(true);          //允许cookie        webClient.getCookieManager().setCookiesEnabled(true);          // 模拟浏览器打开一个目标网址        HtmlPage page = webClient.getPage(TARGET_URL);        /**等待js加载完全,CSDN这点 特别坑,js加载时间超长!!!!!!! 
后人切记不要用CSDN模拟登陆!!!!!!!**/        webClient.waitForBackgroundJavaScript(10000*3);     // 根据form的名字获取页面表单,也可以通过索引来获取:page.getForms().get(0)             HtmlForm form = (HtmlForm) page.getElementById("fm1");                 HtmlTextInput username = (HtmlTextInput) form.getInputByName("username");          HtmlPasswordInput password = (HtmlPasswordInput) form.getInputByName("password");          username.setValueAttribute("********");  //用户名        password.setValueAttribute("********");  //密码        HtmlButtonInput button  = (HtmlButtonInput) page.getByXPath("//input[contains(@class, 'logging')]").get(0);//        ScriptResult result = page.executeJavaScript("javascript:document.getElementsByClassName('logging')[0].click()");//        HtmlPage retPage = (HtmlPage) result.getNewPage();        HtmlPage retPage = button.click();        // 等待JS驱动dom完成获得还原后的网页          webClient.waitForBackgroundJavaScript(1000);          //输出跳转网页的地址        System.out.println(retPage.getUrl().toString());           //输出跳转网页的内容        System.out.println(retPage.asXml());        //获取cookie          Set<Cookie> cookies = webClient.getCookieManager().getCookies();        Map<String, String> responseCookies = new HashMap<String, String>();          for (Cookie c : cookies) {              responseCookies.put(c.getName(), c.getValue());              System.out.print(c.getName()+":"+c.getValue());          }          webClient.close();        System.out.println("Success!");    }}

另外,CSDN 的 JS 总是莫名其妙地报一堆错。如果不想看、想忽略的话,在创建 WebClient 之前加上如下代码:

        //设置日志级别,原页面js异常不打印        LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log",    "org.apache.commons.logging.impl.NoOpLog");                  java.util.logging.Logger.getLogger("com.gargoylesoftware.htmlunit")              .setLevel(Level.OFF);            java.util.logging.Logger.getLogger("org.apache.commons.httpclient")              .setLevel(Level.OFF);