虚拟浏览器(WebClient)应用简单例子

来源:互联网 发布:数据段offset什么意思 编辑:程序博客网 时间:2024/06/05 05:24

WebClient 是 HtmlUnit 提供的一个类似虚拟浏览器的网页抓取工具,其主要特点是适合抓取动态页面,例如由 JavaScript 动态生成的网页(Jsoup 似乎无法处理这类页面)。


首先要引入依赖包,主要是 HtmlUnit;不过它的依赖比较分散,要运行还需要同时引入许多其他 jar 包,如下:


以下是一个简单的应用例子:

package j2seTest2;import java.net.URL;import com.gargoylesoftware.htmlunit.JavaScriptPage;import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;import com.gargoylesoftware.htmlunit.Page;import com.gargoylesoftware.htmlunit.WebClient;import com.gargoylesoftware.htmlunit.html.HtmlDivision;import com.gargoylesoftware.htmlunit.html.HtmlElement;import com.gargoylesoftware.htmlunit.html.HtmlForm;import com.gargoylesoftware.htmlunit.html.HtmlPage;public class WebClientTest {public static void main(String[] args) {//final String url = "http://tv.cntv.cn/epg";//final String url = "http://weixin.sogou.com/gzh?openid=oIWsFt3aMWa50-g2CZwbXYUqhdpI";final String format = "http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=%s&page=%s&t=%s";final String openid="oIWsFt9udi0U5dw56-s0dPWW85pM";final String url = String.format(format, openid, "" + 1, System.currentTimeMillis());final WebClient client = new WebClient();client.getOptions().setJavaScriptEnabled(true);// 默认执行jsclient.getOptions().setCssEnabled(false);client.setAjaxController(new NicelyResynchronizingAjaxController());client.getOptions().setThrowExceptionOnScriptError(false);try {Page page = client.getPage(new URL(url));if (page instanceof HtmlPage) {HtmlPage hPage = (HtmlPage) page;System.out.println("~~~HtmlPage");System.out.println(hPage.getTitleText());//HtmlForm form = hPage.getForms().get(0);//HtmlDivision div = (HtmlDivision) form.getByXPath("//div[@id='zhu1']").get(1);//List<HtmlElement> ahtmpr = div.getHtmlElementsByTagName("a");//System.out.println(hPage.asXml());System.out.println(hPage.asText());} else if (page instanceof JavaScriptPage) {JavaScriptPage jPage = (JavaScriptPage) page;System.out.println("~~~JavaScriptPage");System.out.println("statusCode:" + jPage.getWebResponse().getStatusCode());System.out.println(jPage.getContent());}} catch (Exception e) {e.printStackTrace();} finally {client.closeAllWindows();}}}


0 0